import numpy as np
import pandas as pd
# Load the raw book metadata; the path is relative to the working directory.
books = pd.read_csv('books_data.csv')
# Preview the first rows to check that the columns were parsed as expected.
books.head()
| Title | description | authors | image | previewLink | publisher | publishedDate | infoLink | categories | ratingsCount | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Its Only Art If Its Well Hung! | NaN | ['Julie Strain'] | http://books.google.com/books/content?id=DykPA... | http://books.google.nl/books?id=DykPAAAACAAJ&d... | NaN | 1996 | http://books.google.nl/books?id=DykPAAAACAAJ&d... | ['Comics & Graphic Novels'] | NaN |
| 1 | Dr. Seuss: American Icon | Philip Nel takes a fascinating look into the k... | ['Philip Nel'] | http://books.google.com/books/content?id=IjvHQ... | http://books.google.nl/books?id=IjvHQsCn_pgC&p... | A&C Black | 2005-01-01 | http://books.google.nl/books?id=IjvHQsCn_pgC&d... | ['Biography & Autobiography'] | NaN |
| 2 | Wonderful Worship in Smaller Churches | This resource includes twelve principles in un... | ['David R. Ray'] | http://books.google.com/books/content?id=2tsDA... | http://books.google.nl/books?id=2tsDAAAACAAJ&d... | NaN | 2000 | http://books.google.nl/books?id=2tsDAAAACAAJ&d... | ['Religion'] | NaN |
| 3 | Whispers of the Wicked Saints | Julia Thomas finds her life spinning out of co... | ['Veronica Haddon'] | http://books.google.com/books/content?id=aRSIg... | http://books.google.nl/books?id=aRSIgJlq6JwC&d... | iUniverse | 2005-02 | http://books.google.nl/books?id=aRSIgJlq6JwC&d... | ['Fiction'] | NaN |
| 4 | Nation Dance: Religion, Identity and Cultural ... | NaN | ['Edward Long'] | NaN | http://books.google.nl/books?id=399SPgAACAAJ&d... | NaN | 2003-03-01 | http://books.google.nl/books?id=399SPgAACAAJ&d... | NaN | NaN |
# (rows, columns) of the table as loaded, before any cleaning.
books.shape
(212404, 10)
# Discard the three link columns; a new frame is returned and rebound.
books = books.drop(columns=['image', 'previewLink', 'infoLink'])
print(books.shape)
books.head()
(212404, 7)
| Title | description | authors | publisher | publishedDate | categories | ratingsCount | |
|---|---|---|---|---|---|---|---|
| 0 | Its Only Art If Its Well Hung! | NaN | ['Julie Strain'] | NaN | 1996 | ['Comics & Graphic Novels'] | NaN |
| 1 | Dr. Seuss: American Icon | Philip Nel takes a fascinating look into the k... | ['Philip Nel'] | A&C Black | 2005-01-01 | ['Biography & Autobiography'] | NaN |
| 2 | Wonderful Worship in Smaller Churches | This resource includes twelve principles in un... | ['David R. Ray'] | NaN | 2000 | ['Religion'] | NaN |
| 3 | Whispers of the Wicked Saints | Julia Thomas finds her life spinning out of co... | ['Veronica Haddon'] | iUniverse | 2005-02 | ['Fiction'] | NaN |
| 4 | Nation Dance: Religion, Identity and Cultural ... | NaN | ['Edward Long'] | NaN | 2003-03-01 | NaN | NaN |
import matplotlib.pyplot as plt
# Take the first run of four digits in publishedDate as the year (still text).
year_text = books['publishedDate'].str.extract(r'(\d{4})', expand=False)
# Attach the year, drop rows where no year was found, then store it as int.
books = (
    books.assign(publishedYear=year_text)
    .dropna(subset=['publishedYear'])
    .astype({'publishedYear': int})
)
# Preview the frame to check the new 'publishedYear' column.
books.head()
| Title | description | authors | publisher | publishedDate | categories | ratingsCount | publishedYear | |
|---|---|---|---|---|---|---|---|---|
| 5 | The Church of Christ: A Biblical Ecclesiology ... | In The Church of Christ: A Biblical Ecclesiolo... | ['Everett Ferguson'] | Wm. B. Eerdmans Publishing | 1996 | ['Religion'] | 5.0 | 1996 |
| 31 | Voices from the Farm: Adventures in Community ... | Twenty-five years ago, at the height of the co... | ['Rupert Fike'] | Book Publishing Company | 2012-08-21 | ['Biography & Autobiography'] | 1.0 | 2012 |
| 33 | The Battleship Bismarck | The Bismarck is perhaps the most famous – and ... | ['Stefan Draminski'] | Bloomsbury Publishing | 2018-09-20 | ['History'] | 1.0 | 2018 |
| 42 | Tess and the Highlander | In 1543, on a windswept isle off of Scotland, ... | ['May Mcgoldrick'] | Harper Collins | 2002-11 | ['Juvenile Fiction'] | 2.0 | 2002 |
| 43 | Beginner's Yoruba (Hippocrene Beginner's Series) | "Beginner's Yoruba" is now available with two ... | ['Kayode J. Fakinlede'] | Hippocrene Books | 2005 | ['Foreign Language Study'] | 1.0 | 2005 |
# Remove the raw date string in place; publishedYear is kept.
books.drop(columns='publishedDate', inplace=True)
books.head()
| Title | description | authors | publisher | categories | ratingsCount | publishedYear | |
|---|---|---|---|---|---|---|---|
| 5 | The Church of Christ: A Biblical Ecclesiology ... | In The Church of Christ: A Biblical Ecclesiolo... | ['Everett Ferguson'] | Wm. B. Eerdmans Publishing | ['Religion'] | 5.0 | 1996 |
| 31 | Voices from the Farm: Adventures in Community ... | Twenty-five years ago, at the height of the co... | ['Rupert Fike'] | Book Publishing Company | ['Biography & Autobiography'] | 1.0 | 2012 |
| 33 | The Battleship Bismarck | The Bismarck is perhaps the most famous – and ... | ['Stefan Draminski'] | Bloomsbury Publishing | ['History'] | 1.0 | 2018 |
| 42 | Tess and the Highlander | In 1543, on a windswept isle off of Scotland, ... | ['May Mcgoldrick'] | Harper Collins | ['Juvenile Fiction'] | 2.0 | 2002 |
| 43 | Beginner's Yoruba (Hippocrene Beginner's Series) | "Beginner's Yoruba" is now available with two ... | ['Kayode J. Fakinlede'] | Hippocrene Books | ['Foreign Language Study'] | 1.0 | 2005 |
# Tally books per publication year, ordered chronologically by year.
books_per_year = (
    books['publishedYear']
    .value_counts()
    .sort_index()
)
books_per_year
| count | |
|---|---|
| publishedYear | |
| 1776 | 3 |
| 1823 | 1 |
| 1852 | 1 |
| 1861 | 1 |
| 1869 | 1 |
| ... | ... |
| 2019 | 604 |
| 2020 | 648 |
| 2021 | 618 |
| 2022 | 241 |
| 2023 | 3 |
142 rows × 1 columns
# Tick positions every 10 years, counted from the earliest year in the data.
start_year = books_per_year.index.min()
end_year = books_per_year.index.max()
decade_ticks = np.arange(start_year, end_year + 1, 10)

# Bar chart with one bar per publication year.
plt.figure(figsize=(14, 6))
plt.bar(books_per_year.index, books_per_year.values, color='skyblue')
plt.xticks(decade_ticks, rotation=45)
plt.xlabel('Year')
plt.ylabel('Number of Books')
plt.title('Number of Books Published Per Year')
plt.grid(True, axis='y')
plt.show()
# Drop, in place, every row that still has a missing value in any column.
books.dropna(inplace=True)
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook asks for, one after another.
for resource in ('punkt', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)
def clean_text(text):
    """Normalise a piece of text for word-level analysis.

    The text is lowercased, stripped of the characters in
    ``string.punctuation``, tokenised with ``word_tokenize`` and every token
    is passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The string to clean.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    # Only ASCII punctuation is deleted; characters outside
    # string.punctuation (for example an en dash) are left in the text.
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalised = text.lower().translate(punctuation_table)

    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(normalised))
    return ' '.join(lemmas)
# Clean each description that is present; missing values are left untouched.
books['description'] = books['description'].apply(
    lambda raw: clean_text(str(raw)) if pd.notnull(raw) else raw
)
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip. [nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Downloading package omw-1.4 to /root/nltk_data... [nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
# Preview each title next to its cleaned description.
books[['Title', 'description']].head()
| Title | description | |
|---|---|---|
| 5 | The Church of Christ: A Biblical Ecclesiology ... | in the church of christ a biblical ecclesiolog... |
| 31 | Voices from the Farm: Adventures in Community ... | twentyfive year ago at the height of the count... |
| 33 | The Battleship Bismarck | the bismarck is perhaps the most famous – and ... |
| 42 | Tess and the Highlander | in 1543 on a windswept isle off of scotland se... |
| 43 | Beginner's Yoruba (Hippocrene Beginner's Series) | beginner yoruba is now available with two acco... |
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Replace missing values in both text columns with empty strings so that
# the joins below only ever see strings.
books['description'] = books['description'].fillna('')
books['Title'] = books['Title'].fillna('')

# One long string per column: this is what the word cloud is built from.
title_text = ' '.join(books['Title'])
description_text = ' '.join(books['description'])

# Both clouds share the same canvas settings.
cloud_options = dict(width=800, height=400, background_color='white')
title_wordcloud = WordCloud(**cloud_options).generate(title_text)
description_wordcloud = WordCloud(**cloud_options).generate(description_text)

# Draw the title cloud first, then the description cloud.
for cloud, heading in (
    (title_wordcloud, 'Word Cloud for Titles'),
    (description_wordcloud, 'Word Cloud for Descriptions'),
):
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis('off')
    plt.show()
import matplotlib.pyplot as plt
# Count books per publisher and keep the 50 most frequent publishers.
top_publishers = (
    books['publisher']
    .value_counts()
    .head(50)
)

# Bar chart of those publishers, drawn on a fresh 10x6 figure.
plt.figure(figsize=(10, 6))
top_publishers.plot.bar()
plt.xticks(rotation=45, ha='right')
plt.xlabel('Publishers')
plt.ylabel('Count')
plt.title('Top 50 Most Frequent Publishers')
plt.tight_layout()
plt.show()
from collections import Counter
# Turn each 'categories' cell into a list of labels: trim the first and last
# character, delete apostrophes, then split on ', ' or '& '. Because of the
# '& ' split, a cell such as 'Biography & Autobiography' is tallied as two
# separate categories.
category_lists = (
    books['categories']
    .dropna()
    .str[1:-1]
    .str.replace("'", "")
    .str.split(', |& ')
)
category_counts = Counter(
    label.strip()
    for labels in category_lists
    for label in labels
)
# Tabulate the tallies, most frequent category first.
category_counts_df = pd.DataFrame(
    category_counts.items(), columns=['Category', 'Count']
).sort_values(by='Count', ascending=False)
category_counts_df.head(10)
| Category | Count | |
|---|---|---|
| 14 | Fiction | 11149 |
| 4 | Juvenile Fiction | 3346 |
| 0 | Religion | 2902 |
| 3 | History | 2465 |
| 1 | Biography | 2312 |
| 2 | Autobiography | 2309 |
| 21 | Business | 1064 |
| 22 | Economics | 1061 |
| 23 | Juvenile Nonfiction | 1019 |
| 39 | Computers | 881 |
import re
from collections import Counter
def split_authors(authors_str):
    """Split a comma-separated author string into individual names.

    A comma is not treated as a separator when everything after it, up to
    the end of the string, is one of the known suffixes (``Jr.``, ``Sr.``,
    ``Ph.D.``, ``M.D.``, ``D.D.S.``, ``Inc.``, ``LLC.``, ``Ltd.``), so
    ``"Martin Luther King, Jr."`` stays a single name. Quotes get no
    special treatment.

    Args:
        authors_str: Comma-separated author names; may be empty or None.

    Returns:
        A list of name strings, or an empty list for falsy input.
    """
    if not authors_str:
        return []
    # The suffix alternation must be non-capturing: re.split() inserts every
    # capturing group into its result, and a group inside a negative
    # lookahead never takes part in a match, so it would appear as a None
    # entry after each split point.
    return re.split(
        r',\s*(?!\s*(?:Jr\.|Sr\.|Ph\.D\.|M\.D\.|D\.D\.S\.|Inc\.|LLC\.|Ltd\.)$)',
        authors_str,
    )
def clean_authors(author_list):
    """Drop entries that are company or degree suffixes rather than names.

    Args:
        author_list: Iterable of author name strings. ``None`` and empty
            entries are tolerated and dropped.

    Returns:
        A new list with the kept entries, unchanged and in their original
        order.
    """
    ignore_list = {'Inc', 'PhD', 'LLC', 'Ltd', 'M.D.', 'D.D.S.', 'Incorporated'}
    kept = []
    for author in author_list:
        if not author:
            # None or '' carries no name.
            continue
        name = author.strip()
        # Compare without surrounding whitespace, and also without a
        # trailing period, so that ' LLC ' and 'Inc.' are recognised even
        # though the set stores the bare forms 'LLC' and 'Inc'.
        if name in ignore_list or name.rstrip('.') in ignore_list:
            continue
        kept.append(author)
    return kept
# Process and count authors.
# Each 'authors' cell holds the text of a Python list, e.g. "['Philip Nel']".
# Parsing it as a literal keeps every name intact. Trimming the brackets and
# deleting all apostrophes instead turns ["Louis L'Amour"] into
# '"Louis LAmour"' (stray double quotes, apostrophe lost) and splits names
# that themselves contain a comma.
import ast


def _parse_authors(raw):
    """Return the author names stored in one raw 'authors' cell."""
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [name for name in parsed if isinstance(name, str)]
    # Not a list literal: fall back to the text-based splitting.
    return split_authors(raw[1:-1].replace("'", ""))


author_counts = Counter(
    name.strip()
    for raw in books['authors'].dropna()
    for name in clean_authors(_parse_authors(raw))
    if name
)
# Convert to DataFrame for easier analysis: most frequent author first,
# with a fresh 0..n-1 index.
author_counts_df = (
    pd.DataFrame(author_counts.items(), columns=['Author', 'Count'])
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)
author_counts_df
| Author | Count | |
|---|---|---|
| 0 | "Louis LAmour" | 98 |
| 1 | Agatha Christie | 56 |
| 2 | Nora Roberts | 48 |
| 3 | Georgette Heyer | 43 |
| 4 | William Shakespeare | 40 |
| ... | ... | ... |
| 32593 | Iris Rainer Dart | 1 |
| 32594 | Richard Hunt | 1 |
| 32595 | Henry Justice Ford | 1 |
| 32596 | Dave Grohl | 1 |
| 32597 | Elvira Woodruff | 1 |
32598 rows × 2 columns
# Bar chart of the first 25 rows of author_counts_df.
top_authors = author_counts_df.head(25)
top_authors.plot.bar(x='Author', y='Count', legend=False, figsize=(12, 5))
plt.title('Top 25 Authors')
plt.xlabel('Author')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()
# Pie chart of the first 15 rows of category_counts_df; each slice is
# labelled with its category and its share to one decimal place.
top_categories = category_counts_df.head(15)
plt.figure(figsize=(10, 5))
plt.pie(
    top_categories['Count'],
    labels=top_categories['Category'],
    autopct='%1.1f%%',
)
plt.title('Top 15 Categories')
plt.tight_layout()
plt.show()
from nbconvert import HTMLExporter
import nbformat
# Load the notebook
notebook_path = '/content/Books_data_EDA.ipynb'  # Replace with the path to your notebook
# Read as UTF-8 explicitly so the platform's default encoding is never used
# for the notebook's JSON text.
with open(notebook_path, 'r', encoding='utf-8') as notebook_file:
    notebook_content = notebook_file.read()
# Convert the notebook to HTML
notebook_node = nbformat.reads(notebook_content, as_version=4)
html_exporter = HTMLExporter()
body, resources = html_exporter.from_notebook_node(notebook_node)
# Save the HTML file
html_path = '/content/Books_data_EDA.html'  # Path to save the HTML file
# Write as UTF-8 as well: the page can contain non-ASCII characters from the
# book data, which a narrower default encoding could fail to encode.
with open(html_path, 'w', encoding='utf-8') as html_file:
    html_file.write(body)
print(f"HTML file saved as {html_path}")
HTML file saved as /content/Books_data_EDA.html